library("readxl")
x_raw <- read_excel("./kcmillersean-billboard-hot-100-1958-2017/original/Hot 100 Audio Features.xlsx")
-
/
sum(is.na(x_raw$spotify_track_explicit))
[1] 4749
x_raw
We eliminate the rows where the target variable is missing.
billboard <- read.csv("./kcmillersean-billboard-hot-100-1958-2017/original/Hot Stuff.csv",header = TRUE)
billboard
x_full <- x_raw[!is.na(x_raw$danceability),]
x_full <- x_full[!is.na(x_full$spotify_track_explicit),]
sum(is.na(x_full$spotify_track_explicit))
[1] 0
x_full
explicit <- x_full$spotify_track_explicit
x <- x_full[,c(9:22)]
x
sum(is.na(x))
[1] 0
We have to see what to do with mode and time_signature.
dim(x)
[1] 23676 14
rr hist(x\(speechiness[x\)spotify_track_explicit == FALSE],freq = FALSE,border = ) hist(x\(speechiness[x\)spotify_track_explicit == TRUE],freq = FALSE,add = TRUE,border = )
C <-cor(x)
C
spotify_track_duration_ms spotify_track_popularity danceability energy
spotify_track_duration_ms 1.00000000 0.198109141 0.092919922 0.12737492
spotify_track_popularity 0.19810914 1.000000000 0.174720923 0.17919595
danceability 0.09291992 0.174720923 1.000000000 0.20443232
energy 0.12737492 0.179195948 0.204432318 1.00000000
key 0.01093583 0.003174794 0.013974729 0.02062887
loudness 0.04015836 0.343989932 0.132717307 0.68614305
mode -0.12642601 -0.115517992 -0.158627624 -0.10393873
speechiness 0.04517899 0.203615939 0.246814282 0.14099577
acousticness -0.29805079 -0.305762683 -0.313068841 -0.58750719
instrumentalness 0.01713666 -0.118529461 0.001627687 -0.00113371
liveness -0.03407297 -0.071495432 -0.129861234 0.11176775
valence -0.15135401 -0.208846077 0.396338665 0.35604178
tempo -0.01634128 0.024300644 -0.151816967 0.16201218
time_signature 0.07397847 0.111737410 0.223424383 0.23011661
key loudness mode speechiness acousticness instrumentalness
spotify_track_duration_ms 0.010935834 0.040158365 -0.12642601 0.04517899 -0.29805079 0.017136662
spotify_track_popularity 0.003174794 0.343989932 -0.11551799 0.20361594 -0.30576268 -0.118529461
danceability 0.013974729 0.132717307 -0.15862762 0.24681428 -0.31306884 0.001627687
energy 0.020628866 0.686143053 -0.10393873 0.14099577 -0.58750719 -0.001133710
key 1.000000000 0.008330687 -0.14400655 0.02637991 -0.02257299 0.003076654
loudness 0.008330687 1.000000000 -0.07977649 0.17164055 -0.40592734 -0.132649016
mode -0.144006548 -0.079776487 1.00000000 -0.13329964 0.14295670 -0.011209945
speechiness 0.026379909 0.171640549 -0.13329964 1.00000000 -0.15490191 -0.056999138
acousticness -0.022572993 -0.405927340 0.14295670 -0.15490191 1.00000000 0.026802890
instrumentalness 0.003076654 -0.132649016 -0.01120994 -0.05699914 0.02680289 1.000000000
liveness -0.002616060 0.043735527 0.01365576 0.07969188 0.04133453 -0.013224803
valence 0.009441598 0.022982663 -0.01981331 -0.02052001 -0.12336474 0.047569447
tempo -0.014142880 0.093918294 0.02027891 0.05687958 -0.10503582 0.003096994
time_signature 0.008179034 0.122478723 -0.05847957 0.08640194 -0.21774293 0.009134502
liveness valence tempo time_signature
spotify_track_duration_ms -0.03407297 -0.151354007 -0.016341278 0.073978466
spotify_track_popularity -0.07149543 -0.208846077 0.024300644 0.111737410
danceability -0.12986123 0.396338665 -0.151816967 0.223424383
energy 0.11176775 0.356041784 0.162012181 0.230116608
key -0.00261606 0.009441598 -0.014142880 0.008179034
loudness 0.04373553 0.022982663 0.093918294 0.122478723
mode 0.01365576 -0.019813314 0.020278909 -0.058479570
speechiness 0.07969188 -0.020520014 0.056879580 0.086401941
acousticness 0.04133453 -0.123364745 -0.105035822 -0.217742931
instrumentalness -0.01322480 0.047569447 0.003096994 0.009134502
liveness 1.00000000 0.024611119 0.019765001 -0.012604036
valence 0.02461112 1.000000000 0.072930576 0.144666917
tempo 0.01976500 0.072930576 1.000000000 -0.017368257
time_signature -0.01260404 0.144666917 -0.017368257 1.000000000
for(i in 1 : (nrow(C) - 1)) {
for(j in (i + 1) : ncol(C)) {
if(abs(C[i, j]) >= 0.5) print(paste0(rownames(C)[i], " - ", colnames(C)[j]))
}
}
[1] "energy - loudness"
[1] "energy - acousticness"
mu_exp
[,1]
spotify_track_duration_ms 2.328391e+05
spotify_track_popularity 6.139210e+01
danceability 7.203289e-01
energy 6.612681e-01
key 5.423867e+00
loudness -6.365256e+00
mode 5.505618e-01
speechiness 1.937759e-01
acousticness 1.422686e-01
instrumentalness 6.431818e-03
liveness 1.990280e-01
valence 5.030620e-01
tempo 1.205554e+02
time_signature 3.998063e+00
mu_nexp = as.matrix(apply(x[explicit == FALSE,],2,mean))
mu_nexp
[,1]
spotify_track_duration_ms 2.199564e+05
spotify_track_popularity 3.800659e+01
danceability 5.833845e-01
energy 6.128126e-01
key 5.216497e+00
loudness -8.999381e+00
mode 7.516473e-01
speechiness 5.689042e-02
acousticness 3.148540e-01
instrumentalness 3.653562e-02
liveness 1.918545e-01
valence 6.177490e-01
tempo 1.201533e+02
time_signature 3.922493e+00
apply(x[explicit == TRUE,],2,var)
spotify_track_duration_ms spotify_track_popularity danceability energy
2.944018e+09 2.114043e+02 1.944620e-02 2.283673e-02
key loudness mode speechiness
1.367608e+01 5.692062e+00 2.475394e-01 1.632485e-02
acousticness instrumentalness liveness valence
3.011615e-02 2.867618e-03 2.262383e-02 4.817320e-02
tempo time_signature
9.131658e+02 7.790322e-02
apply(x[explicit == FALSE,],2,var)
spotify_track_duration_ms spotify_track_popularity danceability energy
4.833257e+09 4.689667e+02 2.184157e-02 4.186070e-02
key loudness mode speechiness
1.254519e+01 1.319797e+01 1.866825e-01 3.255749e-03
acousticness instrumentalness liveness valence
8.287255e-02 2.087088e-02 2.588822e-02 5.636188e-02
tempo time_signature
7.683991e+02 1.053511e-01
Asumim que les variancies son diferents (s’hauria de fer box m test)
sum(explicit == FALSE)
[1] 21095
sum(explicit == TRUE)
[1] 2581
hist(x$loudness[explicit == FALSE],freq = FALSE,border = "green",ylim = c(0,0.2))
hist(x$loudness[explicit == TRUE],freq = FALSE,add = TRUE,border = "red",ylim = c(0,0.2))
hist(x$speechiness[explicit == FALSE],freq = FALSE,border = "green")
hist(x$speechiness[explicit == TRUE],freq = FALSE,add = TRUE,border = "red")
hist(x$danceability[explicit == FALSE],freq = FALSE,border = "green")
hist(x$danceability[explicit == TRUE],freq = FALSE,add = TRUE,border = "red")
hist(x$spotify_track_popularity[explicit == FALSE],freq = FALSE,border = "green",ylim = c(0,0.05))
hist(x$spotify_track_popularity[explicit == TRUE],freq = FALSE,add = TRUE,border = "red",ylim = c(0,0.05))
hist(x$valence[explicit == FALSE],freq = FALSE,border = "green")
hist(x$valence[explicit == TRUE],freq = FALSE,add = TRUE,border = "red")
set.seed(123)
xstd = scale(x,scale = TRUE)
c = sample(1:nrow(xstd),1000)
D = dist(xstd[c,])
hc.ward <- hclust(D,method="ward.D2")
plot(hc.ward,ylab="Distance",main="single linkage (weighted Euclidean)",
xlab="",hang=-1,las=1,cex.main=1)
clusters <- cutree(hc.ward, 2)
table(clusters,explicit[c])
clusters FALSE TRUE
1 629 89
2 277 5
# LDA
library(MASS)
out <- lda(explicit~.,data=x)
plot(out)
pca = princomp(x[c,],cor=TRUE)
plot(pca$scores[,1],pca$scores[,2],col = ifelse(explicit,"red","blue"),asp = 1)
# Creem el dichotomizer
# Primer creem les funcions discriminants
prior_exp = sum(explicit == TRUE)/nrow(x)
prior_nexp = sum(explicit == FALSE)/nrow(x)
S_exp = cov(x[explicit == TRUE,])
S_nexp = cov(x[explicit == FALSE,])
S_exp_inv = solve(S_exp)
S_nexp_inv = solve(S_nexp)
prior_exp
[1] 0.1090133
prior_nexp
[1] 0.8909867
ara ens falta fer learn + test i cross validation
g = function(prior,S_inv,mu,x){
x = t(as.matrix(x))
g = log(prior) - log((2*pi)^(ncol(S)/2)) -1/2*t(x-mu) %*% S_inv %*% (x-mu)
return(g)
}
mal = 0
for (i in 1:nrow(x)){
dico = g(prior_exp,S_exp_inv,mu_exp,x[1,])-g(prior_nexp,S_nexp_inv,mu_nexp,x[1,])
if ((dico < 0 && explicit[i]) || (dico > 0 && !explicit[i])) mal = mal + 1
}
mal/nrow(x)*100 # percentatge errors
[1] 10.90133